In [1]:
import pandas as pd, numpy as np
import kendo_romania
Read data
In [2]:
matches={2018:{},2017:{},2016:{},2015:{},2014:{}}
In [3]:
filename='rawdata/2018/CR/CR25 - Public.xlsx'
sheetname='List of matches'
column_keys={'match_type':2,'aka':{'name':5,'hansoku':6,'point1':7,'point2':8,'point3':9},
'shiro':{'name':15,'hansoku':14,'point1':11,'point2':12,'point3':13},'outcome':10,
'shinpan':{'fukushin1':16,'shushin':17,'fukushin2':18}}
matches[2018]['CR']=kendo_romania.get_matches_from_list(filename,sheetname,column_keys,3)
In [4]:
filename='rawdata/2018/SL/Prezenta SL_WKC17.xlsx'
sheetname=['F','M']
matches[2018]['SL']=kendo_romania.get_matches_from_table(filename,sheetname,5)
In [5]:
categories=['Individual masculin','Echipe']
filename=['rawdata/2017/CN/'+i+'.xlsx' for i in categories]
sheetname='List of matches'
column_keys={'match_type':2,'aka':{'name':5,'hansoku':6,'point1':7,'point2':8,'point3':9},
'shiro':{'name':15,'hansoku':14,'point1':11,'point2':12,'point3':13},'outcome':10,
'shinpan':{'fukushin1':16,'shushin':17,'fukushin2':18}}
shift=0
matches[2017]['CN']=kendo_romania.get_matches_from_list(filename,sheetname,column_keys,3,shift=shift)
In [6]:
categories=['Individual juniori mici','Individual juniori mari','Individual feminin']
filename=['rawdata/2017/CN/'+i+'.xlsx' for i in categories]
shift=-1
matches[2017]['CN']=matches[2017]['CN']+\
kendo_romania.get_matches_from_list(filename,sheetname,column_keys,3,shift=shift)
In [7]:
categories=['Individual masculin']
filename=['rawdata/2017/CR/'+i+'.xlsx' for i in categories]
sheetname='List of matches'
column_keys={'match_type':2,'aka':{'name':5,'hansoku':6,'point1':7,'point2':8,'point3':9},
'shiro':{'name':15,'hansoku':14,'point1':11,'point2':12,'point3':13},'outcome':10}
shift=2
matches[2017]['CR']=kendo_romania.get_matches_from_list(filename,sheetname,column_keys,3,shift=shift)
In [8]:
categories=['Individual juniori','Individual veterani','Individual feminin']
filename=['rawdata/2017/CR/'+i+'.xlsx' for i in categories]
shift=-1
matches[2017]['CR']=matches[2017]['CR']+\
kendo_romania.get_matches_from_list(filename,sheetname,column_keys,3,shift=shift)
In [9]:
categories=['Echipe']
filename=['rawdata/2017/CR/'+i+'.xlsx' for i in categories]
shift=0
matches[2017]['CR']=matches[2017]['CR']+\
kendo_romania.get_matches_from_list(filename,sheetname,column_keys,3,shift=shift)
In [10]:
filename='rawdata/2017/SL/Prezenta.xlsx'
sheetname=['F','M','J']
matches[2017]['SL']=kendo_romania.get_matches_from_table(filename,sheetname,6)
In [11]:
filename='rawdata/2016/SL/Event management - stagiul 4.xlsx'
sheetname=['F','M']
matches[2016]['SL']=kendo_romania.get_matches_from_table(filename,sheetname,6)
In [12]:
sheetname=['J']
matches[2016]['SL']=matches[2016]['SL']+\
kendo_romania.get_matches_from_table(filename,sheetname,5)
In [13]:
categories=['Individual masculin']
filename=['rawdata/2016/CN/'+i+'.xlsx' for i in categories]
sheetname='List of matches'
column_keys={'match_type':2,'aka':{'name':5,'hansoku':6,'point1':7,'point2':8,'point3':9},
'shiro':{'name':15,'hansoku':14,'point1':11,'point2':12,'point3':13},'outcome':10}
shift=2
matches[2016]['CN']=kendo_romania.get_matches_from_list(filename,sheetname,column_keys,3,shift=shift)
In [14]:
categories=['Individual feminin']
filename=['rawdata/2016/CN/'+i+'.xlsx' for i in categories]
shift=-1
matches[2016]['CN']=matches[2016]['CN']+\
kendo_romania.get_matches_from_list(filename,sheetname,column_keys,3,shift=shift)
In [15]:
categories=['Echipe','Male team']
filename=['rawdata/2016/CN/'+i+'.xlsx' for i in categories]
shift=0
matches[2016]['CN']=matches[2016]['CN']+\
kendo_romania.get_matches_from_list(filename,sheetname,column_keys,3,shift=shift)
In [16]:
categories=['Junior 1 individual','Junior 2 individual']
filename=['rawdata/2016/CN/'+i+'.xlsx' for i in categories]
shift=-1
matches[2016]['CN']=matches[2016]['CN']+\
kendo_romania.get_matches_from_list(filename,sheetname,column_keys,3,shift=shift)
In [17]:
filename='rawdata/2016/CR/Event management_CR23.2016.xlsx'
sheetname=['IF_m','IJ_m','IM_m','IS_m']
column_keys={'match_type':0,'aka':{'name':1,'point1':2},
'shiro':{'name':6,'point1':5},'outcome':3,
'shinpan':{'fukushin1':7,'shushin':8,'fukushin2':9}}
shift=0
matches[2016]['CR']=kendo_romania.get_matches_from_list(filename,sheetname,column_keys,4,shift=shift)
In [18]:
sheetname=['EJ_m','ES_m']
matches[2016]['CR']=matches[2016]['CR']+\
kendo_romania.get_matches_from_list(filename,sheetname,column_keys,6,shift=shift)
In [19]:
filename='rawdata/2014/SL/Lista de participanti 6.xlsx'
sheetname=['SF_s','SM_s','J_s']
matches[2014]['SL']=kendo_romania.get_matches_from_table(filename,sheetname,6)
In [20]:
filename='rawdata/2015/SL/Event management - stagiul 5.xlsx'
sheetname=['SF_s','SM_s']
matches[2015]['SL']=kendo_romania.get_matches_from_table(filename,sheetname,6)
In [21]:
filename='rawdata/2015/CN/Event management_CN22.2015.xlsx'
sheetname=['IF_m','IJ2_m','IM_m']
column_keys={'match_type':0,'aka':{'name':1,'point1':2},
'shiro':{'name':6,'point1':5},'outcome':3,
'shinpan':{'fukushin1':7,'shushin':8,'fukushin2':9}}
shift=0
matches[2015]['CN']=kendo_romania.get_matches_from_list(filename,sheetname,column_keys,4,shift=shift)
In [22]:
sheetname='E_m'
matches[2015]['CN']=matches[2015]['CN']+\
kendo_romania.get_matches_from_list(filename,sheetname,column_keys,6,shift=shift)
In [23]:
filename='rawdata/2015/CR/Event management_CR22.2015.xlsx'
sheetname=['IF_m','IS_m']
column_keys={'match_type':0,'aka':{'name':1,'point1':2},
'shiro':{'name':6,'point1':5},'outcome':3,
'shinpan':{'fukushin1':7,'shushin':8,'fukushin2':9}}
shift=0
matches[2015]['CR']=kendo_romania.get_matches_from_list(filename,sheetname,column_keys,4,shift=shift)
In [32]:
matches[2015]['CR'][0]
Out[32]:
Clean up points, matches, player names
In [701]:
def match_cleaner(match):
kind,phase='Unknown','Unknown'
if '#' in match:
stage0=match.split('#')[0].lower()
stage1=match.split('#')[1]
if 'pool' in stage1:
phase='Pool'
if 'Pool' in stage1:
phase='Pool'
elif 'prel' in stage1:
phase='Prelim.'
elif 'Prel' in stage1:
phase='Prelim.'
elif 'layoff' in stage1:
phase='Prelim.'
elif '- F' in stage1:
phase='Finals'
elif 'F -' in stage1:
phase='Finals'
elif 'Final' in stage1:
phase='Finals'
elif 'SF' in stage1:
phase='Finals'
elif 'QF' in stage1:
phase='Finals'
elif 'A'==stage1: phase='Pool'
elif 'B'==stage1: phase='Pool'
elif 'C'==stage1: phase='Pool'
elif 'D'==stage1: phase='Pool'
elif 'E'==stage1: phase='Pool'
elif 'F'==stage1: phase='Pool'
elif 'G'==stage1: phase='Pool'
elif 'H'==stage1: phase='Pool'
elif 'I'==stage1: phase='Pool'
elif 'J'==stage1: phase='Pool'
elif 'K'==stage1: phase='Pool'
elif 'L'==stage1: phase='Pool'
elif 'M'==stage1: phase='Pool'
elif 'N'==stage1: phase='Pool'
elif 'O'==stage1: phase='Pool'
elif 'P'==stage1: phase='Pool'
elif 'Q'==stage1: phase='Pool'
elif 'R'==stage1: phase='Pool'
elif 'S'==stage1: phase='Pool'
elif 'T'==stage1: phase='Pool'
if 'IS' in stage1:
kind="Senior's Individual"
elif 'IF' in stage1:
kind="Women's Individual"
elif 'IM' in stage1:
kind="Men's Individual"
elif 'IC' in stage1:
kind="Children's Individual"
elif 'IJ' in stage1:
kind="Junior's Individual"
elif 'EJ' in stage1:
kind="Junior's Team"
elif 'EF' in stage1:
kind="Men's Team"
elif 'ES' in stage1:
kind="Senior's Team"
if 'individual masculin.' in stage0:
kind="Men's Individual"
if 'echipe.' in stage0:
kind="Mixed Team"
if 'individual juniori' in stage0:
kind="Junior's Team"
if 'individual feminin' in stage0:
kind="Junior's Team"
if 'individual veterani' in stage0:
kind="Senior's Team"
if 'male team' in stage0:
kind="Men's Team"
if 'junior 1 individual' in stage0:
kind="Junior's Individual"
if 'junior 2 individual' in stage0:
kind="Junior's Individual"
elif match=='F':
kind="Women's Individual"
elif match=='M':
kind="Men's Individual"
elif match=='J':
kind="Junior's Individual"
elif match=='SF_s':
kind="Women's Individual"
elif match=='SM_s':
kind="Men's Individual"
elif match=='J_s':
kind="Junior's Individual"
return kind,phase
In [702]:
name_exceptions={'Atanasovski':'Atanasovski X. (XXX)',
'Dobrovicescu (SON)':'Dobrovicescu M. (SON)'}
def name_cleaner(name):
if name not in name_exceptions:
nc=name.replace(' ',' ').split('(')
else:
nc=name_exceptions[name].split('(')
if len(nc)<2:
nc=[nc[0],'XXX']
rname=nc[0].strip()
rclub=nc[1].replace('(','').replace(')','').strip()
rnames=rname.split(' ')
sname=rnames[0]+' '+rnames[1][0]+'.'
return sname, rclub
In [703]:
name_exceptions={'Atanasovski':'Atanasovski A. (MAC)',
'Dobrovicescu (SON)':'Dobrovicescu M. (SON)',
'Ianăș':'Ianăș F.',
'Arabadjiyski': 'Arabadjiyski A.',
'Mandia':'Mandia F.',
'Stanev':'Stanev A.',
'Mochalov':'Mochalov O.',
'Sozzi':'Sozzi A.',
'Crăciunel':'Crăciunel I.',
'Craciunel':'Crăciunel I.',
'Sagaev':'Sagaev L.',
'Buzás':'Buzás C.',
'Csala':'Csala D.',
'Dimitrov':'Dimitrov M.',
'Józsa':'Józsa L.',
'Creangă':'Creangă A.',
'Duțescu':'Duțescu M.',
'Furtună':'Furtună G.',
'Gârbea':'Gârbea I.',
'Stupu':'Stupu I.',
'Mahika-Voiconi':'Mahika-Voiconi S.',
'Mahika':'Mahika-Voiconi S.',
'Stanciu':'Stanciu F.',
'Vrânceanu':'Vrânceanu R.',
'Luca':'Luca M.',
'Wolfs':'Wolfs J.',
'Ducarme':'Ducarme A.',
'Sbârcea':'Sbârcea B.',
'Mocian':'Mocian A.',
'Hatvani':'Hatvani L.',
'Dusan':'Dusan N.',
'Borota':'Borota V.',
'Tsushima':'Tsushima K.',
'Tráser':'Tráser T.',
'Colțea':'Colțea A.',
'Brîcov':'Brîcov A.'}
redflags_names=['-','—','—',np.nan,'. ()','— ','- -.','- -. (-)',
'Kashi','Sankon','București','Victorii:','Sakura','Taiken','Ikada','Sonkei','CRK','Museido',
'Ichimon','Bushi Tokukai 1','Competitori – Shiai-sha','Echipa - roşu','Numele şi prenumele',
'Victorii:','Victorii: 0','Victorii: 1','Victorii: 2','Victorii: 3','Victorii: 4',
'Victorii: 5','?']
name_equals={'Chirea A.':'Chirea M.',
'Ghinet C.':'Ghineț C.',
'Domnița M.':'Domniță M.',
'Garbea I.':'Gârbea I.',
'Horvát M.':'Horváth M.',
'Ionita A.':'Ioniță A.',
'Medvedschi I.':'Medvețchi I.',
'Mahika S.':'Mahika-Voiconi S.',
'Mate L.':'Máté L.',
'Stupu A.':'Stupu I.'}
letter_norm={'ţ':'ț','ş':'ș'}
def name_cleaner(name):
for letter in letter_norm:
name=name.replace(letter,letter_norm[letter])
if name not in name_exceptions:
nc=name.replace(' ',' ').split('(')
else:
nc=name_exceptions[name].split('(')
rname=nc[0].strip()
rnames=rname.split(' ')
sname=rnames[0]+' '+rnames[1][0]+'.'
if sname in name_equals:
sname=name_equals[sname]
return sname
Standardize names
In [704]:
all_players={}
all_players_r={}
all_players_unsorted=set()
for year in matches:
for competition in matches[year]:
for match in matches[year][competition]:
for color in ['aka','shiro']:
name=match[color]['name']
all_players_unsorted.add(name)
if name not in redflags_names:
name=name_cleaner(name)
rname=match[color]['name']
if rname not in all_players_r:all_players_r[rname]=name
if name not in all_players: all_players[name]={}
if year not in all_players[name]:all_players[name][year]={'names':set()}
all_players[name][year]['names'].add(rname)
In [705]:
all_shinpan={}
all_shinpan_r={}
all_shinpan_unsorted=set()
for year in matches:
for competition in matches[year]:
for match in matches[year][competition]:
if 'shinpan' in match:
for color in ['fukushin1','shushin','fukushin2']:
aka=match['aka']['name']
shiro=match['shiro']['name']
if (aka not in redflags_names) and\
(shiro not in redflags_names) and\
(name_cleaner(aka) in all_players) and\
(name_cleaner(shiro) in all_players):
rname=match['shinpan'][color]
all_shinpan_unsorted.add(rname)
if (rname not in redflags_names):
name=name_cleaner(rname)
if name not in all_shinpan: all_shinpan[name]=[]
all_shinpan[name].append(match)
if rname not in all_shinpan_r:all_shinpan_r[rname]=name
Infer clubs
In [706]:
#naive infer
redflags_clubs=['','N/A','RO1','RO2']
club_equals={'MLD':'MOL',
'IKD':'IKA',
'BUL':'BUL/Bg',
'TUR':'TUR/Tr',
'MAC':'MAC/Mc',
'MNE':'MNE/Mn',
'SRB':'SRB/Sr',
'ITA':'ITA/It',
'Musso, Bg':'MUS/Bg',
'Makoto, Sr':'MAK/Sr',
'Szeged, Hu':'SZE/Hu'}
for name in all_players:
for year in all_players[name]:
for name_form in all_players[name][year]['names']:
if '(' in name_form:
club=name_form.split('(')[1].strip()[:-1]
if club in club_equals: club=club_equals[club]
if club not in redflags_clubs:
all_players[name][year]['club']=club
In [707]:
for name in all_players:
for year in all_players[name]:
if 'club' not in all_players[name][year]:
#more than 1 year?
years=np.sort(list(all_players[name].keys()))
if len(years)>1:
#get club from previous year
for y in range(years[0],year):
if y in all_players[name]:
if 'club' in all_players[name][y]:
all_players[name][year]['club']=all_players[name][y]['club']
#if still not found, get club from next year
if 'club' not in all_players[name][year]:
#get club from next year
for y in np.arange(years[-1],year,-1):
if y in all_players[name]:
if 'club' in all_players[name][y]:
all_players[name][year]['club']=all_players[name][y]['club']
In [708]:
clubs_manual={'Balázs-Kercsó Z.':'BTK',
'Vrânceanu M.':'SAN',
'Duțescu M.':'IKA',
'Crăciun D.':'SAM',
'Nagy V.':'ISS',
'Goró L.':'BTK',
'Ghineț G.':'YUK',
'Cioată E.':'KAS',
'Perianu S.':'KNS'}
for name in all_players:
for year in all_players[name]:
if 'club' not in all_players[name][year]:
#if still not found, print error, infer other way
if name in clubs_manual:
all_players[name][year]['club']=clubs_manual[name]
else:
print('error',name,year,all_players[name])
all_players[name][year]['club']='XXX'
In [709]:
clubs={}
for name in all_players:
for year in all_players[name]:
club=all_players[name][year]['club']
if club not in clubs:clubs[club]={}
if year not in clubs[club]:clubs[club][year]=set()
clubs[club][year].add(name)
In [710]:
def outcome_cleaner(outcome):
if outcome=='E': return True
else: return False
In [711]:
def outcome_from_points(aka,shiro):
if aka==shiro: return 'X',0
elif aka>shiro: return 'A',str(aka-shiro)
else: return 'S',str(shiro-aka)
In [712]:
redflags_points=['Puncte']
def points_cleaner(points):
hansoku=0
if '∆' in points:
hansoku=1
points=points.replace('∆','')
if len(points)>2:
print(points,'error')
elif len(points)>1:
point1=points[0]
point2=points[1]
elif len(points)>0:
point1=points[0]
point2=''
else:
point1=''
point2=''
return point1,point2,len(points),hansoku
In [713]:
def club_cleaner(club):
if '/' in club:
return club.split('/')[0],club.split('/')[1].upper()
else:
return club,'RO'
In [714]:
pretty_clubs={'ARA':'Arashi', 'BSD':'Bushido', 'BTK':'Bushi Tokukai', 'BG':'Bulgaria',
'CDO':'Coroan de Oțel', 'CRK':'Clubul Român de Kendo', 'HAR':'Hargita',
'ICH':'Ichimon', 'IKA':'Ikada','ISS':'Ishhin', 'IT':'Italy','HU':'Hungary',
'KAS':'Kashi', 'KNS':'Kenshin', 'KYO':'Kyobukan', 'MC':'Macedonia',
'SR':'Serbia', 'MN':'Montenegro', 'MOL':'Moldova', 'MUS':'Museido',
'RON':'Ronin-do', 'SAK':'Sakura', 'SAM':'Sam-sho','SAN':'Sankon', 'SBK':'Sobukan',
'SON':'Sonkei', 'SR':'Serbia', 'TAI':'Taiken', 'TR':'Turkey', 'XXX':'Unknown',
'YUK':'Yu-kai'}
def pretty_club(club, country):
if country!='RO':
return pretty_clubs[country]
else: return pretty_clubs[club]
In [715]:
master_matches=[]
for year in matches:
for competition in matches[year]:
print(year,competition)
for k in matches[year][competition]:
good=True
match={'year':year,'competition':competition}
match['match_kind'],match['match_phase']=match_cleaner(k['match_type'])
if 'shinpan' in k:
for color in ['fukushin1','shushin','fukushin2']:
if color in k['shinpan']:
if k['shinpan'][color] in all_shinpan_r:
match[color]=all_shinpan_r[k['shinpan'][color]]
for a in ['aka','shiro']:
points=''
for h in k[a]:
if h=='name':
name=k[a][h]
if name in all_players_r:
match[a+' name']=all_players_r[name]
club, country=club_cleaner(all_players[match[a+' name']][year]['club'])
match[a+' club'], match[a+' country']=club, country
match[a+' pretty_club']=pretty_club(club, country)
else:
good=False
else:
point=k[a][h]
if str(point)=='nan': point=''
points=points+point
for redflag in redflags_points:
if redflag in points:
good=False
if good:
match[a+' point1'],match[a+' point2'],match[a+' points'],match[a+' hansoku']=points_cleaner(points)
if good:
if 'outcome' in k:
match['encho']=outcome_cleaner(k['outcome'])
else:
match['encho']=False
match['winner'],match['difference']=outcome_from_points(match['aka points'],match['shiro points'])
master_matches.append(match)
In [716]:
data=pd.DataFrame(master_matches)
Cleanup
In [717]:
data.to_csv('data/matches.csv')
Group by player
In [718]:
aka=data[[i for i in data.columns if 'shiro ' not in i]]
aka.columns=[i.replace('aka ','') for i in aka.columns]
aka['color']='aka'
In [719]:
shiro=data[[i for i in data.columns if 'aka ' not in i]]
shiro.columns=[i.replace('shiro ','') for i in shiro.columns]
shiro['color']='shiro'
In [720]:
extended_matches=pd.concat([aka,shiro],axis=0).reset_index(drop=True)
In [721]:
extended_matches.to_csv('data/extended_matches.csv')
In [722]:
p1=extended_matches[[i for i in extended_matches.columns if i!='point2']]
p2=extended_matches[[i for i in extended_matches.columns if i!='point1']]
p1.rename(columns={'point1':'point'}, inplace=True)
p2.rename(columns={'point2':'point'}, inplace=True)
In [723]:
extended_points=pd.concat([p1,p2],axis=0).reset_index(drop=True)
In [724]:
extended_points.to_csv('data/extended_points.csv')
In [725]:
extended_points
Out[725]:
Competitor statistics
In [726]:
competitors={}
for i in data.T.iteritems():
for a in ['aka ','shiro ']:
name=i[1][a+'name']
club=i[1][a+'club']
if name not in competitors:
competitors[name]={'U':0,'club':club}
for j in ['point1','point2']:
point=i[1][a+j]
if point!='':
if point not in competitors[name]:competitors[name][point]=0
competitors[name][point]+=1
competitors[name]['U']+=1
In [727]:
data2=pd.DataFrame(competitors)
In [728]:
data2.T.to_csv('data/competitors.csv')